%load_ext nb_black
import pandas as pd
import sys
from sklearn.linear_model import LogisticRegression, LinearRegression
from collections import Counter, defaultdict
import numpy as np
import plotly.express as px
import warnings
from scipy.stats import spearmanr, kendalltau
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, vstack
from scipy.special import logit, expit
# Load the raw results; the code below treats this as a mapping whose values
# are iterables of per-team dicts.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
results_data = pd.read_pickle("data/results.pkl")
# sys.getsizeof is shallow: this is the size of the dict object itself, in KiB.
display(sys.getsizeof(results_data) / 1024)
# Drop every tournament that has at least one team without an answer mask.
results_data = {
    tour_id: teams
    for tour_id, teams in results_data.items()
    if not any(team.get("mask") is None for team in teams)
}
display(sys.getsizeof(results_data) / 1024)
288.0859375
144.09375
# Players table, re-indexed by the player's own id.
players_df = pd.DataFrame.from_dict(
    pd.read_pickle("data/players.pkl"), orient="index"
).set_index("id")

# Tournaments table, re-indexed by tournament id, with timezone-aware dates.
tournaments_df = pd.DataFrame.from_dict(
    pd.read_pickle("data/tournaments.pkl"), orient="index"
).set_index("id")
for date_column in ("dateStart", "dateEnd"):
    tournaments_df[date_column] = pd.to_datetime(
        tournaments_df[date_column], utc=True
    )

# Keep only tournaments that still have a non-empty list of team results.
tournaments_df = tournaments_df.loc[
    [tour_id for tour_id, teams in results_data.items() if len(teams)]
]

# Half-open windows on dateStart: train = [2019, 2020), test = [2020, 2021).
year_start_train, year_start_test, year_stop_test = "2019", "2020", "2021"
starts_in_train = (tournaments_df.dateStart >= year_start_train) & (
    tournaments_df.dateStart < year_start_test
)
starts_in_test = (tournaments_df.dateStart >= year_start_test) & (
    tournaments_df.dateStart < year_stop_test
)
tournaments_df_train = tournaments_df[starts_in_train]
tournaments_df_test = tournaments_df[starts_in_test]
tournaments_df_train.shape, tournaments_df_test.shape
((673, 8), (167, 8))
# Length of the longest answer mask over every team of every tournament.
mask_lengths = [
    len(team["mask"]) for teams in results_data.values() for team in teams
]
max_mask_len = max(mask_lengths)
max_mask_len
500
# Keep results only for tournaments starting inside the combined train+test
# window [year_start_train, year_stop_test).
starts_in_window = (tournaments_df.dateStart >= year_start_train) & (
    tournaments_df.dateStart < year_stop_test
)
results_data = {
    tour_id: results_data[tour_id]
    for tour_id in tournaments_df.loc[starts_in_window].index
}
# Shallow size of the narrowed dict, in KiB.
display(sys.getsizeof(results_data) / 1024)
36.09375
# Sanity checks on the split. The year bounds are derived from the split
# variables rather than repeated as literals, so changing the split years does
# not silently invalidate (or break) these checks.
assert len(results_data) == tournaments_df_train.shape[0] + tournaments_df_test.shape[0]
train_years = tournaments_df_train.dateStart.dt.year
test_years = tournaments_df_test.dateStart.dt.year
assert (
    (train_years >= int(year_start_train)) & (train_years < int(year_start_test))
).all()
assert (
    (test_years >= int(year_start_test)) & (test_years < int(year_stop_test))
).all()
# Show the train tournaments, the players table and one sample team record.
display(tournaments_df_train, players_df, next(iter(results_data.values()))[0])
| name | dateStart | dateEnd | type | season | orgcommittee | synchData | questionQty | |
|---|---|---|---|---|---|---|---|---|
| id | ||||||||
| 4772 | Синхрон северных стран. Зимний выпуск | 2019-01-05 16:00:00+00:00 | 2019-01-09 16:00:00+00:00 | {'id': 3, 'name': 'Синхрон'} | /seasons/52 | [{'id': 28379, 'name': 'Константин', 'patronym... | {'dateRequestsAllowedTo': '2019-01-09T23:59:59... | {'1': 12, '2': 12, '3': 12} |
| 4973 | Балтийский Берег. 3 игра | 2019-01-25 16:05:00+00:00 | 2019-01-29 16:00:00+00:00 | {'id': 3, 'name': 'Синхрон'} | /seasons/52 | [{'id': 23030, 'name': 'Марина', 'patronymic':... | {'dateRequestsAllowedTo': '2019-01-28T23:59:59... | {'1': 12, '2': 12, '3': 12} |
| 4974 | Балтийский Берег. 4 игра | 2019-03-01 16:05:00+00:00 | 2019-03-05 16:00:00+00:00 | {'id': 3, 'name': 'Синхрон'} | /seasons/52 | [{'id': 23030, 'name': 'Марина', 'patronymic':... | {'dateRequestsAllowedTo': '2019-03-04T23:59:59... | {'1': 12, '2': 12, '3': 12} |
| 4975 | Балтийский Берег. 5 игра | 2019-04-05 16:05:00+00:00 | 2019-04-09 16:00:00+00:00 | {'id': 3, 'name': 'Синхрон'} | /seasons/52 | [{'id': 23030, 'name': 'Марина', 'patronymic':... | {'dateRequestsAllowedTo': '2019-04-08T23:59:59... | {'1': 12, '2': 12, '3': 12} |
| 4986 | ОВСЧ. 6 этап | 2019-02-15 17:00:00+00:00 | 2019-02-19 17:00:00+00:00 | {'id': 3, 'name': 'Синхрон'} | /seasons/52 | [{'id': 59140, 'name': 'Борис', 'patronymic': ... | {'dateRequestsAllowedTo': '2019-02-19T23:59:59... | {'1': 12, '2': 12, '3': 12} |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6173 | Кубок Мэра Казани | 2019-12-15 07:00:00+00:00 | 2019-12-15 12:00:00+00:00 | {'id': 2, 'name': 'Обычный'} | /seasons/53 | [{'id': 33624, 'name': 'Ренат', 'patronymic': ... | None | {'1': 15, '2': 15, '3': 15, '4': 15} |
| 6191 | Всеармянский Интеллектуальный Фестиваль | 2019-12-22 09:00:00+00:00 | 2019-12-22 13:00:00+00:00 | {'id': 2, 'name': 'Обычный'} | /seasons/53 | [{'id': 19981, 'name': 'Сейран', 'patronymic':... | None | {'1': 12, '2': 12, '3': 12} |
| 6249 | Школьный синхрон-lite. Сезон 3 | 2019-08-31 21:05:00+00:00 | 2020-04-30 20:55:00+00:00 | {'id': 5, 'name': 'Общий зачёт'} | /seasons/53 | [{'id': 23740, 'name': 'Владимир', 'patronymic... | None | {'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ... |
| 6254 | Школьная лига | 2019-10-04 16:00:00+00:00 | 2020-03-22 16:00:00+00:00 | {'id': 5, 'name': 'Общий зачёт'} | /seasons/53 | [{'id': 39218, 'name': 'Владислав', 'patronymi... | None | {'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ... |
| 6255 | ОВСЧ | 2019-09-20 17:00:00+00:00 | 2020-02-19 20:59:00+00:00 | {'id': 5, 'name': 'Общий зачёт'} | /seasons/53 | [{'id': 32901, 'name': 'Наиль', 'patronymic': ... | None | {'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ... |
673 rows × 8 columns
| name | patronymic | surname | |
|---|---|---|---|
| id | |||
| 1 | Алексей | None | Абабилов |
| 10 | Игорь | Абалов | |
| 11 | Наталья | Юрьевна | Абалымова |
| 12 | Артур | Евгеньевич | Абальян |
| 13 | Эрик | Евгеньевич | Абальян |
| ... | ... | ... | ... |
| 224700 | Артём | Евгеньевич | Садов |
| 224701 | Даниил | Олегович | Трефилов |
| 224702 | Владимир | Араратович | Басенцян |
| 224703 | Руслан | Ринатович | Дауранов |
| 224704 | Александр | Викторович | Гапонов |
204063 rows × 3 columns
{'team': {'id': 45556,
'name': 'Рабочее название',
'town': {'id': 285, 'name': 'Санкт-Петербург'}},
'mask': '111111111011111110111111111100010010',
'current': {'name': 'Рабочее название',
'town': {'id': 285, 'name': 'Санкт-Петербург'}},
'questionsTotal': 28,
'synchRequest': {'id': 56392,
'venue': {'id': 3030, 'name': 'Санкт-Петербург'}},
'position': 1,
'controversials': [{'id': 91169,
'questionNumber': 15,
'answer': 'Мьёльнир',
'issuedAt': '2019-01-06T13:28:48+03:00',
'status': 'A',
'comment': '',
'resolvedAt': '2019-01-06T15:25:54+03:00',
'appealJuryComment': None}],
'flags': [],
'teamMembers': [{'flag': 'Б',
'usedRating': 13507,
'rating': 13507,
'player': {'id': 6212,
'name': 'Юрий',
'patronymic': 'Яковлевич',
'surname': 'Выменец'}},
{'flag': 'Б',
'usedRating': 10988,
'rating': 13185,
'player': {'id': 18332,
'name': 'Александр',
'patronymic': 'Витальевич',
'surname': 'Либер'}},
{'flag': 'Б',
'usedRating': 8534,
'rating': 12801,
'player': {'id': 18036,
'name': 'Михаил',
'patronymic': 'Ильич',
'surname': 'Левандовский'}},
{'flag': 'К',
'usedRating': 6401,
'rating': 12801,
'player': {'id': 22799,
'name': 'Сергей',
'patronymic': 'Игоревич',
'surname': 'Николенко'}},
{'flag': 'Б',
'usedRating': 4252,
'rating': 12757,
'player': {'id': 15456,
'name': 'Сергей',
'patronymic': 'Владимирович',
'surname': 'Коновалов'}},
{'flag': 'Б',
'usedRating': 2069,
'rating': 12416,
'player': {'id': 26089,
'name': 'Ирина',
'patronymic': 'Сергеевна',
'surname': 'Прокофьева'}}]}
# player id -> one list per game played; each list holds one
# (answer, question difficulty) pair per question, where answer is the team's
# 0/1 mask value and difficulty is the share of teams in that tournament that
# missed the question.
players_games: dict[int, list[list[tuple[int, float]]]] = defaultdict(list)
used_cnt = 0
skiped_cnt = 0
for t_id in tournaments_df_train.index:
    data = results_data[t_id]
    answers_cnt = Counter([len(t["mask"]) for t in data])
    if len(answers_cnt) != 1:
        # Masks of different lengths cannot be stacked into one matrix, so the
        # whole tournament is skipped.
        skiped_cnt += answers_cnt.total()
        continue
    used_cnt += answers_cnt.total()
    # Parse every mask exactly once: "?" counts as answered, "X" as missed.
    masks = [
        list(map(int, team["mask"].replace("?", "1").replace("X", "0")))
        for team in data
    ]
    results = np.array(masks, dtype=bool)
    # Per-question difficulty: share of teams that did NOT answer it.
    questions_power = list(1 - results.mean(axis=0))
    for team, mask in zip(data, masks):
        for u in team["teamMembers"]:
            # A fresh list per player so entries are never shared between players.
            players_games[u["player"]["id"]].append(list(zip(mask, questions_power)))
# Share of team records skipped, relative to ALL records (skipped + used);
# guarded so an empty training set does not divide by zero.
total_cnt = skiped_cnt + used_cnt
skiped_share = skiped_cnt / total_cnt if total_cnt else 0.0
f"пропущено {skiped_share * 100:.0f}% данных из-за различной длины вопросов внутри соревнования"
'пропущено 11% данных из-за различной длины вопросов внутри соревнования'
Смотрим статистики игроков на основе команд, в которых они играли
# player = players_games[22799]
def plot_answers_for_player(player: list[list[tuple[int, int]]]):
    """Display three charts describing one player's answers.

    ``player`` is a list of games; every game is a list of pairs whose first
    item is the answer flag and whose second item is the question difficulty.
    Shows a scatter of answer flag against difficulty, then one 20-bin
    difficulty histogram for answers flagged 1 and one for answers flagged 0.
    """
    # Flatten all games into a single list of (answer, difficulty) pairs.
    answers = [(q[0], q[1]) for g in player for q in g]

    scatter = px.scatter(
        x=[difficulty for _, difficulty in answers],
        y=[answer for answer, _ in answers],
        title="Игрок",
    )
    scatter.update_layout(
        xaxis_title="Сложность вопроса",
        yaxis_title="Правильный/неправильный ответ",
    )
    display(scatter)

    histogram_specs = (
        (1, "распределение правильных ответов", "количество правильных ответов"),
        (0, "распределение неправильных ответов", "количество неправильных ответов"),
    )
    for outcome, title, y_label in histogram_specs:
        histogram = px.histogram(
            x=[difficulty for answer, difficulty in answers if answer == outcome],
            nbins=20,
            title=title,
        )
        histogram.update_layout(
            xaxis_title="Сложность вопроса",
            yaxis_title=y_label,
        )
        display(histogram)
# Draw the diagnostic charts for three hand-picked player ids.
for showcased_id in (22799, 87797, 87509):
    plot_answers_for_player(players_games[showcased_id])
def player_stats(player):
    """Return (mean difficulty of answered questions, mean difficulty of missed ones).

    ``player`` is a list of games, each a list of (answer, difficulty) pairs.
    A side with no questions (or whose mean is NaN) is reported as 0.
    """
    answered, missed = [], []
    for game in player:
        for q in game:
            if q[0] == 1:
                answered.append(q[1])
            if q[0] == 0:
                missed.append(q[1])
    # np.mean of an empty list emits a RuntimeWarning and yields NaN; the
    # warning is silenced and the NaN is mapped to 0 below.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        averages = [np.mean(bucket) for bucket in (answered, missed)]
    corr, incorr = (0 if np.isnan(value) else value for value in averages)
    return corr, incorr
# player_stats for the same three hand-picked players, shown as one tuple.
tuple(player_stats(players_games[pid]) for pid in (22799, 87797, 87509))
((0.4975770084453995, 0.8328454403839074), (0.43555547069053063, 0.7163843381518152), (0.42532894807533034, 0.7055017639185375))
# Total length of a player's feature vector (half per answer outcome).
features = 40


def features_for_player(player):
    """Return difficulty histograms of a player's answers as one flat vector.

    ``player`` is a list of games, each a list of (answer, difficulty) pairs.
    The first ``features // 2`` entries count answers flagged 1 per difficulty
    bin over [0, 1]; the remaining ``features // 2`` entries do the same for
    answers flagged 0. Counts are raw (not density-normalised).
    """
    bins_per_outcome = features // 2
    per_outcome_counts = []
    for outcome in (1, 0):
        difficulties = [q[1] for g in player for q in g if q[0] == outcome]
        counts, _edges = np.histogram(
            difficulties, bins=bins_per_outcome, range=(0, 1), density=False
        )
        per_outcome_counts.append(counts)
    return np.concatenate(per_outcome_counts)
# Plot the feature vector of each hand-picked player.
for showcased_id in (22799, 87797, 87509):
    display(px.line(features_for_player(players_games[showcased_id])))
Статистики команд на основе статистик игроков, которые входят в эту команду
def team_features(players):
    """Return the element-wise mean of the members' feature vectors.

    ``players`` is a sized sequence of per-player game lists, as accepted by
    ``features_for_player``.
    """
    rows = [features_for_player(member) for member in players]
    # Explicit float dtype and shape keep the (0, features) case well-formed.
    team = np.asarray(rows, dtype=float).reshape(len(players), features)
    return team.mean(axis=0)
# Plot the averaged feature vector for two hand-picked groups of players.
for roster in ((22799, 87797, 87509), (14518, 12770, 6064)):
    display(px.line(team_features([players_games[pid] for pid in roster])))
def team_stats(players):
    """Average ``player_stats`` over the members of a team.

    ``players`` is either a sequence of player ids (Python ints) or a sequence
    of per-player game lists. Returns a pair (mean of the members' "answered"
    statistic, mean of their "missed" statistic); an empty team yields (0, 0).
    """
    if len(players) == 0:
        return (0, 0)
    if isinstance(players[0], int):
        # players_games is a defaultdict: indexing it with an unseen id would
        # insert an empty entry and grow its key set as a side effect of a
        # read. .get() gives the same empty history without mutating it.
        players = [players_games.get(p, []) for p in players]
    per_player = [player_stats(player) for player in players]
    team_corr = [corr for corr, _ in per_player]
    team_incorr = [incorr for _, incorr in per_player]
    return np.mean(team_corr), np.mean(team_incorr)
# team_stats accepts game lists or plain ids; the first two entries must match.
first_roster = [22799, 87797, 87509]
second_roster = [14518, 12770, 6064]
(
    team_stats([players_games[pid] for pid in first_roster]),
    team_stats(first_roster),
    team_stats([players_games[pid] for pid in second_roster]),
)
((0.45282047573708684, 0.7515771808180866), (0.45282047573708684, 0.7515771808180866), (0.4615331484659509, 0.7518183967285608))
Результат
def compute_metrics(df):
    """Return mean (Spearman, Kendall) rank correlation over the tournaments in ``df``.

    For every tournament id in ``df.index`` each team is scored with the mean
    of its ``team_stats`` pair, and the scores are correlated with the negated
    list index of the team in ``results_data``. A NaN correlation counts as 0.
    NOTE(review): this presumes the team list is stored in final-standing
    order - confirm against the data source.
    """
    spearman_scores = []
    kendall_scores = []
    for tour_id in df.index:
        team_scores = []
        for team in results_data[tour_id]:
            member_ids = [member["player"]["id"] for member in team["teamMembers"]]
            team_scores.append(np.mean(team_stats(member_ids)))
        descending_rank = -np.arange(len(team_scores))
        spearman = spearmanr(team_scores, descending_rank).correlation
        kendall = kendalltau(team_scores, descending_rank).correlation
        spearman_scores.append(0 if np.isnan(spearman) else spearman)
        kendall_scores.append(0 if np.isnan(kendall) else kendall)
    return np.mean(spearman_scores), np.mean(kendall_scores)
# Report the statistics-based ranking quality on both splits.
for split_label, split_df in (
    ("Train:", tournaments_df_train),
    ("Test:", tournaments_df_test),
):
    print(split_label, compute_metrics(split_df))
Train: (0.7017189636014557, 0.542864839188076) Test: (0.6342618710526812, 0.47879146683885)
Это было решение через статистики, которое, к сожалению, не требуется в данной задаче. Поэтому далее надо построить матрицу для M-шага.
# One row per (player, question) of every training tournament:
# (question_id, player_id, team_id, answer). The team's answer is repeated for
# each of its members.
players_games_df = []
for t_id in tournaments_df_train.index:
    # Question ids are made unique across tournaments by offsetting the
    # in-tournament index with the tournament id.
    question_id_base = t_id * max_mask_len * 2
    for team in results_data[t_id]:
        # "?" counts as answered, "X" as missed.
        answers = [int(ch) for ch in team["mask"].replace("?", "1").replace("X", "0")]
        for member in team["teamMembers"]:
            player_id = member["player"]["id"]
            team_id = team["team"]["id"]
            players_games_df.extend(
                (question_id_base + question_idx, player_id, team_id, answer)
                for question_idx, answer in enumerate(answers)
            )
players_games_df = np.array(players_games_df)
players_games_df.shape
21066091
Далее делается датасет для предсказания вероятности ответа на вопрос
# Peek at two sample rows: (question_id, player_id, team_id, answer).
tuple(players_games_df[row] for row in (0, 100))
(array([4772000, 6212, 45556, 1]), array([4772028, 18036, 45556, 0]))
# Sparse design matrix: one-hot player columns followed by one-hot question
# columns (this order is relied on when slicing coefficients later).
question_enc = OneHotEncoder()
player_enc = OneHotEncoder()
X = hstack(
    [
        encoder.fit_transform(players_games_df[:, column].reshape(-1, 1))
        for encoder, column in ((player_enc, 1), (question_enc, 0))
    ]
)
X.shape
(21066091, 92270)
# Baseline: logistic regression of the answer flag on player + question one-hots.
answer_flags = players_games_df[:, 3]
baseline_model = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    random_state=0,
)
baseline_model.fit(X, answer_flags)
# Accuracy on the training rows themselves.
baseline_model.score(X, answer_flags)
0.7590245860041144
# Sanity check: the model has exactly one coefficient per one-hot column
# (player categories plus question categories).
assert (
    baseline_model.coef_.shape[1]
    == player_enc.categories_[0].shape[0] + question_enc.categories_[0].shape[0]
)
# For every key of players_games, the position of its one-hot column inside
# the player block (argmax over the encoded row).
# NOTE(review): transform() will presumably reject ids the encoder never saw,
# so players_games must not contain players absent from players_games_df - confirm.
players_enc_idx = (
    player_enc.transform(np.array(list(players_games.keys())).reshape(-1, 1))
    .argmax(axis=1)
    .flatten()
)
# Per-player score derived from the intercept and the player's coefficient.
# NOTE(review): a logistic sigmoid of (intercept + coef) would be
# 1 / (1 + exp(-(intercept + coef))); here the coefficient enters exp() with a
# plus sign, i.e. the value equals sigmoid(intercept - coef). The metrics
# further down correlate against the ascending team index, which looks
# consistent with this flipped orientation - confirm it is intentional before
# changing either side.
players_proba = 1 / (
    1
    + np.exp(
        -baseline_model.intercept_
        + baseline_model.coef_[0, : player_enc.categories_[0].shape[0]][players_enc_idx]
    )
)
# Map player id -> score. The [0] presumably unwraps a (1, n_players) result
# produced by the 2-D index above - TODO confirm the shape.
players_proba = {p_id: pp for p_id, pp in zip(players_games.keys(), players_proba[0])}
def team_p(team, players_proba):
    """Return 1 minus the product of (1 - score) over the team's members.

    ``team`` is a result record with a "teamMembers" list; ``players_proba``
    maps player id to a score. Members missing from the mapping contribute a
    score of 0, so a team with no known members gets 0.
    """
    nobody_scores = 1
    member_ids = (member["player"]["id"] for member in team["teamMembers"])
    for player_id in member_ids:
        nobody_scores *= 1 - players_proba.get(player_id, 0)
    return 1 - nobody_scores
def compute_metrics(df, players_proba):
    """Return mean (Spearman, Kendall) rank correlation over ALL tournaments in ``df``.

    For every tournament id in ``df.index`` each team in ``results_data`` is
    scored with ``team_p(team, players_proba)`` and the scores are correlated
    with the team's ascending list index. A NaN correlation (e.g. constant
    scores) counts as 0.

    Args:
        df: DataFrame whose index holds tournament ids present in ``results_data``.
        players_proba: mapping from player id to the per-player score used by ``team_p``.

    Returns:
        Tuple ``(mean_spearman, mean_kendall)`` averaged over the tournaments.
    """
    metric_spearman = []
    metric_kendall = []
    for tour_id in df.index:
        check = [team_p(t, players_proba) for t in results_data[tour_id]]
        # Correlate against the ascending index, matching the orientation of
        # the scores supplied by callers in this file.
        order = np.arange(len(check))
        spearman = spearmanr(check, order).correlation
        kendall = kendalltau(check, order).correlation
        if np.isnan(spearman):
            spearman = 0
        if np.isnan(kendall):
            kendall = 0
        metric_spearman.append(spearman)
        metric_kendall.append(kendall)
        # No early exit here: a leftover `break` used to stop after the first
        # tournament, so the "mean" covered a single tournament only.
    return np.mean(metric_spearman), np.mean(metric_kendall)
# Report the model-based ranking quality on both splits.
for split_label, split_df in (
    ("Train:", tournaments_df_train),
    ("Test:", tournaments_df_test),
):
    print(split_label, compute_metrics(split_df, players_proba))
Train: (0.6665696372592925, 0.503707886316582) Test: (0.6359447004608295, 0.48924988055422836)
Далее вводим скрытую переменную и запускаем итерации
# Clipping margin that keeps probabilities strictly inside (0, 1) so logit() stays finite.
eps = 1e-5


def em(p):
    """Run three EM-style iterations starting from per-row probabilities ``p``.

    ``p`` holds one probability per row of ``players_games_df``. Each iteration
    fits the global ``model`` to ``logit(p)``, prints the test metrics for the
    resulting per-player scores, and recomputes ``p`` from the model's
    predictions. Nothing is returned; the fitted state lives in ``model``.
    Relies on module-level ``model``, ``X``, ``player_enc``, ``players_enc_idx``,
    ``players_games``, ``players_games_df``, ``tournaments_df_test`` and ``eps``.
    """
    for _ in range(3):
        # M-step: regress the logit of the current probabilities on the one-hot features.
        model.fit(X, logit(p))
        # Per-player score from the intercept and the player's coefficient.
        # NOTE(review): the coefficient enters exp() with a plus sign, i.e.
        # sigmoid(intercept - coef) - same orientation as the baseline scores
        # in this file; confirm it is intentional.
        players_proba = 1 / (
            1
            + np.exp(
                -model.intercept_
                + model.coef_[: player_enc.categories_[0].shape[0]][players_enc_idx]
            )
        )
        players_proba = {
            p_id: pp for p_id, pp in zip(players_games.keys(), players_proba[0])
        }
        print(compute_metrics(tournaments_df_test, players_proba))
        # E-step: predicted per-row probability under the freshly fitted model.
        p = expit(model.predict(X))
        # question id -> list of (1 - p) values of the current team's rows.
        questions = defaultdict(list)
        curr_team = players_games_df[0, 2]
        prev_idx = 0
        z = []
        # Rows are processed in runs of equal team id (column 2); this relies on
        # each team's rows being contiguous in players_games_df.
        for idx, (r, curr_p) in enumerate(zip(players_games_df, p)):
            if r[2] != curr_team:
                # The run ended: collapse each question's list into the product
                # of (1 - p) over the rows collected for that question.
                for q in questions.keys():
                    questions[q] = np.prod(questions[q])
                # Emit that product once per row of the finished run, in row order.
                for i in range(prev_idx, idx):
                    z.append(questions[players_games_df[i, 0]])
                questions = defaultdict(list)
                curr_team = r[2]
                prev_idx = idx
            questions[r[0]].append(1 - curr_p)
        # Flush the final run (the loop above only flushes on a team change).
        for q in questions.keys():
            questions[q] = np.prod(questions[q])
        for i in range(prev_idx, idx + 1):
            z.append(questions[players_games_df[i, 0]])
        z = np.array(z)
        # Row probability divided by (1 - product of (1 - p) over the run's rows
        # for the same question).
        z = p / (1 - z)
        # Multiply by the last column (the answer flag), so rows with flag 0 become 0.
        z = z * players_games_df[:, -1]
        # Keep the values strictly inside (0, 1) for the next logit().
        p = np.clip(z, eps, 1 - eps)
# Regressor refitted by em() on every iteration.
model = LinearRegression()
# Start from the baseline's positive-class probability, zeroed on rows whose
# answer flag (last column) is 0, then clipped away from 0 and 1.
positive_class_proba = baseline_model.predict_proba(X)[:, 1]
initial_p = positive_class_proba * players_games_df[:, -1]
em(np.clip(initial_p, eps, 1 - eps))
0.6791244522801549 0.5109023904505778 0.6825513414599562 0.5149939064122251 0.6827447215189244 0.515333255453505